# Loading packages
library(readtext)
library(readxl)
library(dplyr)
library(tidyverse)
library(tidytext)
library(tm)
library(textstem)
library(wordcloud)
library(slam)
library(topicmodels)
library(SentimentAnalysis)
The main objective of this paper is to analyse the Federal Open Market Committee statements using text mining methods and tools provided by R. We start with a basic analysis of the length of each statement, and then move on to word counts, sentiment analysis and topic modelling.
The Federal Open Market Committee (FOMC) is the body of the central bank of the United States (the Federal Reserve System). Its main duty is setting the national monetary policy. The FOMC holds eight regularly scheduled meetings per year. At these meetings, the Committee reviews economic and financial conditions, determines the appropriate stance of monetary policy, and assesses the risks to its long-run goals of price stability and sustainable economic growth. The FOMC consists of 12 voting members: seven members of the Board of Governors, the president of the Federal Reserve Bank of New York and 4 of the remaining 11 Reserve Bank presidents, who serve one-year terms on a rotating basis. All 12 of the Reserve Bank presidents attend FOMC meetings and participate in FOMC discussions, but only the presidents who are Committee members at the time may vote on policy decisions. FOMC meetings typically are held eight times each year in Washington, D.C., and at other times as needed.
The Committee releases a public statement immediately after each FOMC meeting. Each statement follows a very similar structure. Firstly, the general background of the economic situation is presented. Then the Committee introduces the value of the established federal funds rate and also shares predictions. At the end, the names of the members who voted for the FOMC monetary policy action are listed.
We sourced the data by scraping the statements from the Federal Reserve official website 1 using Python. In the scraping algorithm we limited the content only to the FOMC announcement, omitting the names of voters listed in the last paragraph. The analysed period covers the years 2006 to 2018, which resulted in obtaining 107 documents.
# Loading scraped statements
# DATA_DIR <- "C:/Users/KAndr/OneDrive/Studia/II rok I semestr/Text mining/Text mining project/Statements/"
DATA_DIR <- "~/FOMC-text-mining/Statements"
# DATA_DIR <- "~/Desktop/FOMC-text-mining/Statements"
# Read each yearly folder (2006-2018) and bind all statements into one
# readtext data frame. This replaces 13 copy-pasted readtext() calls,
# the manual rbind() of 13 objects, and the remove() cleanup of the
# temporary fomc_* variables.
statements <- do.call(
  rbind,
  lapply(2006:2018, function(y) readtext(paste0(DATA_DIR, "/", y, "/*")))
)
We start our work on the statements with the initial preprocessing of the dataset. It consists of two columns: doc_id and text. The doc_id is sourced from each statement's website link, while text is simply the content of the statement.
# Preview the structure of the readtext object (doc_id + text columns)
head(statements, 1)
## readtext object consisting of 1 document and 0 docvars.
## # Description: df[,2] [1 x 2]
## doc_id text
## * <chr> <chr>
## 1 20060131.txt "\"The Federa\"..."
# Adding a unique ID for each statement
statements <- statements %>% mutate(ID = 1:n())
# Setting column names
colnames(statements) <- c("Date", "Text", "ID")
# doc_id has the form "YYYYMMDD.txt"; strip the extension with an
# anchored pattern (the original ".txt" treated '.' as a regex wildcard,
# so it could match e.g. "astxt" anywhere in the string)
statements$Date <- gsub("\\.txt$", "", statements$Date)
# Parse the remaining "YYYYMMDD" string into a Date (the original format
# string "%Y%m%d " had a stray trailing space)
statements$Date <- as.Date(statements$Date, "%Y%m%d")
# Plain character vector of statement texts for building the corpus
statements_all <- as.vector(statements$Text)
length(statements_all)
## [1] 107
The next step was converting the dataset into a volatile corpus, which is a convenient form for the following operations. Below, an example statement can be seen before any text preprocessing operations are applied.
# Build a volatile (in-memory) corpus, one document per statement;
# the surrounding parentheses also print the corpus summary
(corpus_all <- VCorpus(VectorSource(statements_all)))
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 107
# Show the first document before any cleaning
inspect(corpus_all[[1]])
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 778
##
## The Federal Open Market Committee decided today to raise its target for the federal funds rate by 25 basis points to 4-1/2 percent. Although recent economic data have been uneven, the expansion in economic activity appears solid. Core inflation has stayed relatively low in recent months and longer-term inflation expectations remain contained. Nevertheless, possible increases in resource utilization as well as elevated energy prices have the potential to add to inflation pressures. The Committee judges that some further policy firming may be needed to keep the risks to the attainment of both sustainable economic growth and price stability roughly in balance. In any event, the Committee will respond to changes in economic prospects as needed to foster these objectives.
We start preprocessing with text cleaning using the tm_map() function. We lower-case the text, remove words from the built-in stopwords list, and remove punctuation, unnecessary whitespace and numbers. At the end we apply the PlainTextDocument() function.
# Text cleaning pipeline: lower-case, drop English stopwords, then
# remove punctuation, extra whitespace and digits.
# tolower() is a plain string function, not a tm transformation, so it
# must be wrapped in content_transformer(); the original
# tm_map(tolower) degraded the documents to bare character vectors,
# which is why a final PlainTextDocument step was needed to repair
# them. With content_transformer() the documents keep their
# PlainTextDocument class (and metadata) throughout, so that final
# repair step is no longer required.
corpus_clean <- corpus_all %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("en")) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(removeNumbers)
Below can be seen examples of the statements after above cleaning steps.
# First statement after the cleaning steps above
as.character(corpus_clean[[1]])
## [1] " federal open market committee decided today raise target federal funds rate basis points percent although recent economic data uneven expansion economic activity appears solid core inflation stayed relatively low recent months longerterm inflation expectations remain contained nevertheless possible increases resource utilization well elevated energy prices potential add inflation pressures committee judges policy firming may needed keep risks attainment sustainable economic growth price stability roughly balance event committee will respond changes economic prospects needed foster objectives "
In order to ease operations on the corpus, we modify it into a data frame.
# Flatten the cleaned corpus into a plain data frame, one row per
# statement; `[`(doc, "content") extracts the text of each document.
# FALSE is spelled out (F is an ordinary, reassignable variable in R).
df_corpus <- data.frame(text = unlist(sapply(corpus_clean, `[`, "content")), stringsAsFactors = FALSE)
# Sequential document id matching the corpus order
df_corpus <- df_corpus %>% mutate(doc_id = 1:n())
df_corpus$text[1]
## [1] " federal open market committee decided today raise target federal funds rate basis points percent although recent economic data uneven expansion economic activity appears solid core inflation stayed relatively low recent months longerterm inflation expectations remain contained nevertheless possible increases resource utilization well elevated energy prices potential add inflation pressures committee judges policy firming may needed keep risks attainment sustainable economic growth price stability roughly balance event committee will respond changes economic prospects needed foster objectives "
In the next steps, we append the statements data frame with the cleaned text. We also count the number of words occurring in the original statement and in the cleaned statement.
# Attach the cleaned text to the statements data frame
# (row order of df_corpus matches statements by construction)
statements_clean <- statements %>%
  mutate(cleaned_text = df_corpus$text)
# Number of words per statement AFTER cleaning: tokenize the cleaned
# text and sum token counts per document ID
count_cleaned_word <- statements_clean %>%
  unnest_tokens(word_count, cleaned_text) %>%
  count(ID, word_count, sort = TRUE) %>%
  group_by(ID) %>%
  summarize(word_cleaned_count = sum(n))
statements_clean_count <- left_join(statements_clean, count_cleaned_word, by = "ID")
# Number of words per statement in the ORIGINAL text
count_word <- statements_clean_count %>%
  unnest_tokens(word_count, Text) %>%
  count(ID, word_count, sort = TRUE) %>%
  group_by(ID) %>%
  summarize(word_count = sum(n))
statements_final <- left_join(statements_clean_count, count_word, by = "ID")
On the plot below we can see line plots of word counts over time, one for statements before and one after cleaning. The number increased over time for both types until 2014, when it started to decrease. The proportion of the number of cleaned words to the number of total words is fairly constant and oscillates around 60%.
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
library(viridis)
## Loading required package: viridisLite
# Line plot comparing original vs cleaned word counts per statement
myplot <- statements_final %>%
select(Date, word_count, word_cleaned_count) %>%
ggplot() +
# Word count of the original statements
geom_line(aes(x = Date,
y = word_count),
color = viridis(10)[3]) +
# Word count after the cleaning pipeline
geom_line(aes(x = Date,
y = word_cleaned_count),
color = viridis(10)[6]) +
labs(x = "Date",
y = "Number of words",
title = "Comparison of number of words between original and cleaned <br>statements content over time") +
scale_x_date(date_breaks = "1 year",
date_labels = "%Y") +
theme_minimal()
# Interactive (plotly) rendering of the ggplot above
ggplotly(myplot)
Zipf's law, in the context of text mining, states that the frequency of a word is inversely proportional to its rank in the frequency table. We decided to check the accuracy of the law empirically on our dataset. Below is a dataset with the Zipf frequency included.
library(ggplot2)
library(dplyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
# Word frequencies across all cleaned statements, most frequent first.
# (A previous version also computed year(Date), but count() on the
# single token column drops it, so it is omitted here.)
word_counts_zipf <- statements_clean_count %>%
  unnest_tokens(word_count, cleaned_text) %>%
  count(word_count, sort = TRUE)
# Data frame containing words and their frequency
word_count <- word_counts_zipf
colnames(word_count) <- c("word", "count")
alpha <- 1  # Zipf exponent; change if needed
# Theoretical Zipf frequency: f(rank) = f(1) / rank^alpha.
# At rank 1 this reduces to the observed top count, so the original
# ifelse(rank == 1, ...) special case was redundant.
word_count <- word_count %>%
  mutate(word = factor(word, levels = word),
         rank = row_number(),
         zipfs_freq = dplyr::first(count) / rank^alpha)
word_count
## # A tibble: 1,174 x 4
## word count rank zipfs_freq
## <fct> <int> <int> <dbl>
## 1 committee 915 1 915
## 2 inflation 834 2 458.
## 3 will 638 3 305
## 4 economic 556 4 229.
## 5 market 446 5 183
## 6 federal 445 6 152.
## 7 rate 418 7 131.
## 8 labor 331 8 114.
## 9 conditions 326 9 102.
## 10 securities 314 10 91.5
## # ... with 1,164 more rows
Additionally, we decided to present the visualisation on a point plot. Based on this visualisation we decided to remove the words with the highest and lowest frequencies, setting the cut-offs at ranks 10 and 300 and leaving only the words in between.
# Zipf's law visualisation: count vs rank, with vertical lines marking
# the cut-offs (ranks 10 and 300) used to trim the vocabulary below
p1 <- ggplot(word_count,
aes(x = rank, y = count,
color = rank,
text = paste("Word: ", word, "<br>Count: ", count))) +
geom_point() +
labs(x = "rank", y = "count", title = "Zipf's law visualization") +
scale_color_viridis_c() +
geom_vline(xintercept = 10) +
geom_vline(xintercept = 300) +
theme_minimal() +
theme(legend.position = "none")
# Interactive version; hovering shows the word and its count
ggplotly(p1, tooltip = "text")
# Cut-offs chosen from the Zipf plot: drop the 10 most frequent words
# and everything ranked 300 or below.
large_zipf <- as.vector(word_count$word[1:10])
# nrow(word_count) replaces the hard-coded vocabulary size (1174), so
# the code keeps working if the corpus or cleaning steps change
small_zipf <- as.vector(word_count$word[300:nrow(word_count)])
corpus_clean <- corpus_clean %>% tm_map(removeWords, large_zipf)
corpus_clean <- corpus_clean %>% tm_map(removeWords, small_zipf)
# Rebuild the flat data frame from the trimmed corpus and re-attach
# the cleaned text to the statements
df_corpus <- data.frame(text = unlist(sapply(corpus_clean, `[`, "content")), stringsAsFactors = FALSE)
df_corpus <- df_corpus %>% mutate(doc_id = 1:n())
statements_clean <- statements %>%
  mutate(cleaned_text = df_corpus$text)
Using the term frequency–inverse document frequency (tf-idf) statistic, we proceeded to further analyse the statements to find those words that carry more information than the others.
# Word counts per year on the trimmed vocabulary
statements_words <- statements_clean %>%
  mutate(year = year(Date)) %>%
  unnest_tokens(word_count, cleaned_text) %>%
  count(year, word_count, sort = TRUE)
# tf-idf with years playing the role of documents: highlights words
# characteristic of a given year rather than common to all statements
statements_words <- statements_words %>%
  bind_tf_idf(word_count, year, n) %>%
  arrange(desc(tf_idf))
statements_words
## # A tibble: 2,225 x 6
## year word_count n tf idf tf_idf
## <dbl> <chr> <int> <dbl> <dbl> <dbl>
## 1 2008 bank 14 0.0219 1.47 0.0322
## 2 2009 reserve 24 0.0235 1.18 0.0277
## 3 2007 core 8 0.0181 1.47 0.0265
## 4 2010 bank 21 0.0180 1.47 0.0264
## 5 2006 core 8 0.0179 1.47 0.0262
## 6 2008 central 11 0.0172 1.47 0.0253
## 7 2018 symmetric 14 0.0126 1.87 0.0237
## 8 2009 credit 24 0.0235 0.956 0.0225
## 9 2006 needed 12 0.0268 0.773 0.0207
## 10 2014 asset 34 0.0134 1.47 0.0197
## # ... with 2,215 more rows
Using the computed statistic, we visualised the most important words per year, showing which of them were the most characteristic.
# Top-10 tf-idf words per year, ordered for the faceted bar plot
# (<- instead of =, and an explicit ranking column for top_n)
pd <- statements_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word_count, levels = rev(unique(word_count)))) %>%
  group_by(year) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  arrange(year, tf_idf) %>%
  mutate(order = row_number())
# `order` gives each bar a unique position; the axis labels are then
# re-mapped to the words so facets can be sorted independently
ggplot(pd, aes(order, tf_idf, fill = tf_idf)) +
  geom_bar(show.legend = FALSE, stat = "identity") +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~year, ncol = 3, scales = "free") +
  scale_x_continuous(breaks = pd$order,
                     labels = pd$word,
                     expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  coord_flip() +
  theme_minimal() +
  scale_fill_viridis_c(direction = -1)
We also presented the frequency of words using wordclouds. The bigger the word the higher the frequency.
library(wordcloud2)
# Term-document matrix of the trimmed corpus
dtm <- TermDocumentMatrix(corpus_clean)
m <- as.matrix(dtm)
# Total frequency of each term across all statements, descending
v <- sort(rowSums(m), decreasing=TRUE)
d <- data.frame(word = names(v), freq=v)
head(d, 10)
## word freq
## percent percent 305
## policy policy 280
## funds funds 279
## financial financial 213
## range range 206
## agency agency 203
## employment employment 203
## pace pace 198
## information information 185
## target target 180
# Fixed seed for a reproducible word-cloud layout
set.seed(1234)
# Word cloud of the 100 most frequent terms; size encodes frequency
wordcloud2(d %>% arrange(desc(freq)) %>% head(100), color=viridis(100, direction = -1), shape='circle', size=0.2, minRotation = -pi/2, ellipticity = .8)
# Lemmatization: reduce each word of the cleaned text to its lemma
statements_clean <- statements_clean %>%
  mutate(lemma_text = lemmatize_strings(cleaned_text))
# Tokenization: one row per (statement, lemmatized word)
tokens <- unnest_tokens(statements_clean, word, lemma_text)
# NOTE(review): ap_topics is not defined anywhere in this file — it is
# presumably the per-topic beta output of an LDA model (topicmodels);
# confirm the modelling chunk exists upstream.
# Keep the 8 terms with the highest beta (per-topic word weight)
ap_top_terms <- ap_topics %>%
group_by(topic) %>%
top_n(8, beta) %>%
ungroup() %>%
arrange(topic, -beta)
# Faceted bar chart of top terms per topic
ap_top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
# NOTE(review): the constant fill = viridis(40) overrides the aes()
# fill and assumes exactly 40 bars (e.g. 5 topics x 8 terms) — verify
geom_col(show.legend = FALSE, fill = viridis(40)) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
labs(y="Value of beta parameter", x="") +
scale_x_reordered() +
theme_minimal()
library(viridis)
# Topic weights over time, one line per topic.
# NOTE(review): statements_topics is not created in this file — it is
# presumably the per-document topic proportions joined with Date;
# confirm against the (missing) topic-modelling chunk.
topic_density <- ggplot(statements_topics, aes(x=Date)) +
geom_line(aes(y=topic1), color=viridis(10)[2]) +
geom_line(aes(y=topic2), color=viridis(10)[4]) +
geom_line(aes(y=topic3), color=viridis(10)[6]) +
geom_line(aes(y=topic4), color=viridis(10)[8]) +
geom_line(aes(y=topic5), color=viridis(10)[9]) +
# geom_line(aes(y=topic6), color=viridis(10)[10]) +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
theme_minimal()
# Interactive rendering
ggplotly(topic_density)
# Coerce every column to character (drops any factor coding).
# NOTE(review): by_chapter_word is not created anywhere in this file —
# presumably an (ID, word) token table built upstream; confirm.
by_chapter_word <- data.frame(lapply(by_chapter_word, as.character), stringsAsFactors=FALSE)
# install.packages("widyr")
library(widyr)
# Count words co-occurring within the same statement (keyed by ID)
word_pairs <- by_chapter_word %>%
pairwise_count(word, ID, sort = TRUE)
word_pairs
## # A tibble: 77,600 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 target open 104
## 2 funds open 104
## 3 open target 104
## 4 funds target 104
## 5 open funds 104
## 6 target funds 104
## 7 information open 97
## 8 open information 97
## 9 outlook open 96
## 10 outlook target 96
## # ... with 77,590 more rows
# Pairwise correlations between words, restricted to words with at
# least 107 rows in the token table (presumably once per statement,
# since there are 107 statements — confirm upstream table shape)
word_cors <- by_chapter_word %>%
group_by(word) %>%
filter(n() >= 107) %>%
pairwise_cor(word, ID, sort = TRUE)
# NOTE(review): correlation <= 1 holds for every valid correlation, so
# this filter only removes NA rows (NA <= 1 evaluates to NA); if the
# intent was to drop perfectly correlated pairs, `correlation < 1`
# would be needed instead.
word_cors <- word_cors %>% filter(correlation<=1)
library(igraph)
library(ggraph)
# Network of strongly correlated word pairs (correlation > 0.65),
# laid out with the Fruchterman-Reingold ("fr") algorithm
word_cors %>%
filter(correlation > .65) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_label(aes(label = name), repel = TRUE) +
theme_void()